% This code constructs the raw data for later cleaning.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
clear all
close all
clc

addpath(genpath('../Utility'))

%% 1. MONTHLY DATA
% At monthly frequency we have stock market data and Treasury bill data

%% 1.1 Dividends and Returns
% Use the "CRSP.do" file to generate "D_CRSP.xls"
D_m_file = '../Data/D_CRSP.xls';
D_data = readtable(D_m_file);
        
% Use the "CRSP.do" file to generate "D_idx_CRSP.xls"
Index_file = '../Data/D_idx_CRSP.xls';
Index_data = readtable(Index_file);

% Schwert (1990) Returns
% http://schwert.ssb.rochester.edu/mstock.htm
Schwert_file = '../Data/STKDATM.DAT';
Schwert_matrix = importdata(Schwert_file);
Schwert_data = array2table([floor(Schwert_matrix(: , 1)/100) , mod(Schwert_matrix(: , 1) , 100) ,  Schwert_matrix(: , [2 , 4 , 5])],...
               'VariableNames',{'yyyy', 'mm' , 'Rm_schwert' , 'DY_schwert' , 'Rp_schwert'}); 

Schwert_data.DY_schwert = fillmissing(Schwert_data.DY_schwert , 'linear');
Schwert_data.Rp_schwert(1352 : 1355) = Schwert_data.Rm_schwert(1352 : 1355) - Schwert_data.DY_schwert(1352 : 1355);

%% 1.2 Prices, Dividends, and Inflation from Shiller 
% U.S. Stock Markets 1871-Present and CAPE Ratio 
% http://www.econ.yale.edu/~shiller/data.htm
Shiller_file = '../Data/ie_data.xls';
Shiller_table = table2array(readtable(Shiller_file , 'sheet' , 'Data'));
Shiller_matrix = [floor(Shiller_table(: , 1)) , round((Shiller_table(: , 1) - floor(Shiller_table(: , 1))) * 100) , ...
                  Shiller_table(: , 2 : end)];
Shiller_data = array2table(Shiller_matrix(: , [1 , 2 , 3 , 4 , 6]),...
               'VariableNames',{'yyyy', 'mm' , 'P_S' , 'D_S' , 'cpi'});       

%% 1.3 Treasury Yields
% 3-month Treasury bill (secondary market rate) from FRED
% https://fred.stlouisfed.org/series/TB3MS 
TB3MS_file = '../Data/TB3MS.csv';
TB3MS_table = readtable(TB3MS_file);
TB3MS_data = array2table([year(TB3MS_table.DATE) , month(TB3MS_table.DATE) , TB3MS_table.TB3MS] ,...
             'VariableNames',{'yyyy', 'mm' , 'TB3MS'});
         
% 3-month yield data from Nagel (2016)
TB3SN_file = '../Data/TB3SN.csv';
TB3SN_data = readtable(TB3SN_file);
TB3SN_data.Properties.VariableNames{'Year'} = 'yyyy';
TB3SN_data.Properties.VariableNames{'Month'} = 'mm';
TB3SN_data.Properties.VariableNames{'Tbill3m'} = 'TB3SN';

% 1-year constant maturity yield from FRED 
% https://fred.stlouisfed.org/series/GS1
TB1YCM_file = '../Data/GS1.csv';
TB1YCM_table = readtable(TB1YCM_file);
TB1YCM_data = array2table([year(TB1YCM_table.DATE) , month(TB1YCM_table.DATE) , TB1YCM_table.GS1] ,...
             'VariableNames',{'yyyy', 'mm' , 'TB1YCM'});

%% 1.4 Merge 
T_m_list = {D_data , Index_data ,...
            Shiller_data ,...
            Schwert_data ,...
            TB3MS_data , TB3SN_data , TB1YCM_data};

T_m_temp = T_m_list{1};        
for i = 2 : length(T_m_list)
    T_m_temp = outerjoin(T_m_temp , T_m_list{i} , 'Keys',{'yyyy','mm'} ,'MergeKeys',1);
end

T_m_start_idx = find(T_m_temp.yyyy == 1871 & T_m_temp.mm == 1);
T_m_end_idx = find(T_m_temp.yyyy == 2019 & T_m_temp.mm == 12);

T_m = T_m_temp(T_m_start_idx : T_m_end_idx , :);

%% 2. QUARTERLY DATA
% At quarterly frequency we mainly have consumption, population, and survey
% data

%% 2.1 Survey Expectations
% Use the "SurveyExpectations.do" file to construt "inflexp.csv" and
% "stockretexp.csv"
expinfl_file = '../Data/inflexp.csv';
expinfl_data = readtable(expinfl_file);

expret_file = '../Data/stockretexp.csv';
expret_data = readtable(expret_file);
expret_data = rmmissing(expret_data,'DataVariables',{'yyyy','qtr'});

% Use the "IBESConstruction.do" file to construct "IBES_Forecast.xls"
ibes_file = '../Data/IBES_Forecast.xls';
ibes_data = readtable(ibes_file);

%% 2.2. Population
% Quarterly population from FRED
% https://fred.stlouisfed.org/series/B230RC0Q173SBEA
pop_q_file = '../Data/B230RC0Q173SBEA.csv';
pop_q_table = readtable(pop_q_file);
pop_q_data = array2table([year(pop_q_table.DATE) , quarter(pop_q_table.DATE) , pop_q_table.B230RC0Q173SBEA] ,...
             'VariableNames',{'yyyy', 'qtr' , 'pop_NIPA'});

% Annual population from the Historical Statistics of the United States (Table Aa6-8)
% Mapped to mid-year and interpolated
% https://hsus.cambridge.org  
pop_hist_file = '../Data/CambridgeHistStatPop.xlsx';
pop_hist_table = readtable(pop_hist_file);
pop_hist_q_fill = [pop_hist_table.Pop(1 : end - 1)'; nan(3 , length(pop_hist_table.Pop) - 1)];
pop_hist_q = [pop_hist_q_fill(:); pop_hist_table.Pop(end)];
[yyyy_pop_hist , qtr_pop_hist] = gen_yq_vec(pop_hist_table.Year(1) , 2 , length(pop_hist_q)); 
pop_hist_data = array2table([yyyy_pop_hist , qtr_pop_hist , pop_hist_q] ,...
                'VariableNames',{'yyyy', 'qtr' , 'pop_Cam'});

%% 2.3 Consumption
% Nominal consumption from NIPA Table 2.3.5
C_q_file = '../Data/NIPA2.3.5.csv';
C_q_table = readtable(C_q_file);
nd_q = cellfun(@str2double , table2cell(C_q_table(10 , 3 : end)))';%If string
ser_q = cellfun(@str2double , table2cell(C_q_table(15 , 3 : end)))';
% nd_q = table2array(C_q_table(10 , 3 : end)))';%If double
% ser_q = table2array(C_q_table(15 , 3 : end)))';
[yyyy_c_q , qtr_c_q] = gen_yq_vec(1947 , 1 , length(nd_q));    
C_q_data = array2table([yyyy_c_q , qtr_c_q , nd_q , ser_q] ,...
           'VariableNames',{'yyyy', 'qtr' , 'nd' , 'ser'});

%% 2.4 Quarterly Return Volatility         
% Use the "CRSP.do" file to generate "retvol_CRSP.xls"
retvol_file = '../Data/retvol_CRSP.xls';
retvol_data = readtable(retvol_file);

         
%% 2.5 Merge
T_q_list = {expinfl_data , expret_data , ibes_data ,...
            pop_q_data , pop_hist_data ,...
            retvol_data,...
            C_q_data};

T_q_temp = T_q_list{1};        
for i = 2 : length(T_q_list)
    T_q_temp = outerjoin(T_q_temp , T_q_list{i} , 'Keys',{'yyyy','qtr'} ,'MergeKeys',1);
end

T_q_start_idx = find(T_q_temp.yyyy == 1870 & T_q_temp.qtr == 2);
T_q_end_idx = find(T_q_temp.yyyy == 2019 & T_q_temp.qtr == 4);

T_q = T_q_temp(T_q_start_idx : T_q_end_idx , :);

%% 3. ANNUAL DATA

%% 3.1 Dividend
% Aggregate household dividend receipts from tax data in Piketty et al. (2018), Appendix I, Table TSA6
% http://gabriel-zucman.eu/usdina
D_PSZ_file = '../Data/PSZ2017AppendixTablesI(Macro).xlsx';
D_PSZ_data = readtable(D_PSZ_file , 'sheet' , 'TSA6');
D_PSZ_data = rmmissing(D_PSZ_data(: , {'Var1' , 'x_1_'}));
D_PSZ_data.Properties.VariableNames{'Var1'} = 'yyyy';
D_PSZ_data.Properties.VariableNames{'x_1_'} = 'D_PSZ';

% Aggregate corporate non-farm non-financial dividends from Wright (2004)
% www.ems.bbk.ac.uk/faculty/wright/pdf/Wright2004dataset.xls
D_W_file = '../Data/Wright2004dataset.xls';
D_W_data = readtable(D_W_file , 'sheet' , 'dataset 1900-2002');
D_W_data = rmmissing(D_W_data(: , {'Var1' , 'Var22'}));
D_W_data.Properties.VariableNames{'Var1'} = 'yyyy';
D_W_data.Properties.VariableNames{'Var22'} = 'D_W';

%% 3.2 Consumption
% Per capita real consumption/GDP index from Barro and Ursua (2010)
% https://scholar.harvard.edu/barro/data_sets
Crpc_BU_file = '../Data/barro_ursua_macrodataset_1110.xls';
Crpc_BU_data = readtable(Crpc_BU_file , 'sheet' , 'C');
Crpc_BU_data = rmmissing(Crpc_BU_data(: , {'CPc' , 'UnitedStates'}));
Crpc_BU_data.Properties.VariableNames{'CPc'} = 'yyyy';
Crpc_BU_data.Properties.VariableNames{'UnitedStates'} = 'Crpc_BU';
       
Grpc_BU_data = readtable(Crpc_BU_file , 'sheet' , 'GDP');
Grpc_BU_data = rmmissing(Grpc_BU_data(: , {'GDPPc' , 'UnitedStates'}));
Grpc_BU_data.Properties.VariableNames{'GDPPc'} = 'yyyy';
Grpc_BU_data.Properties.VariableNames{'UnitedStates'} = 'Grpc_BU'; 

% Nominal consumption from NIPA Table 2.3.5
C_a_file = '../Data/NIPA2.3.5 Annual.csv';
C_a_table = readtable(C_a_file);
% nd_a = cellfun(@str2double , table2cell(C_a_table(10 , 3 : end)))';%If string
% ser_a = cellfun(@str2double , table2cell(C_a_table(15 , 3 : end)))';
nd_a = table2array(C_a_table(10 , 3 : end))';%If double
ser_a = table2array(C_a_table(15 , 3 : end))';
C_a_data = array2table([(1929 : 1929 + length(nd_a) - 1)' , nd_a , ser_a] ,...
           'VariableNames',{'yyyy' , 'nd' , 'ser'});    

%% 3. Merge
T_a_list = {D_PSZ_data , D_W_data , ...
            Crpc_BU_data , Grpc_BU_data , ...
            C_a_data};

T_a_temp = T_a_list{1};        
for i = 2 : length(T_a_list)
    T_a_temp = outerjoin(T_a_temp , T_a_list{i} , 'Keys', 'yyyy' ,'MergeKeys',1);
end

T_a_start_idx = find(T_a_temp.yyyy == 1871);
T_a_end_idx = find(T_a_temp.yyyy == 2019);

T_a = T_a_temp(T_a_start_idx : T_a_end_idx , :);

%% 4. Output Data

data_file_name = '../Data/Data_Raw.xlsx';
writetable(T_m , data_file_name,'Sheet','Monthly');
writetable(T_q , data_file_name,'Sheet','Quarterly');
writetable(T_a , data_file_name,'Sheet','Annual');